% Second step of data processing. Before running this step, you need to
% analyze the 5' gene expression with Seurat4 and export the Seurat
% metadata.txt (it contains the cell information and cluster information).

% This step needs 3 major input files:
%1. pcdh_result_UMI.csv: the output of step 1
%2. metadata3_neuron.txt: the output of Seurat4
%3. barcodes_clearend.xlsx: a file containing the cell barcodes (without the '-1' at the end)

% The outputs of this script are:
%1. metadata3_neuron_remove.txt: records whether each cell was removed by the cutoff; it can be linked back to Seurat
%2. output\414_neuron_UMI10.csv and related figures: they describe the stochastic expression level of pcdhg


% Start from an empty workspace, then load the UMI table written by step 1.
% The first row of the CSV is consumed as the variable names.
clear
pcdh=cell2mat(table2cell(readtable('pcdh_result_UMI.csv','ReadVariableNames',true)));
% Transposed copy; later sections index the columns of pcdhg per cell.
pcdhg=pcdh';
%pcdhg=pcdhg';

% Read the Seurat metadata (the first row is treated as data, not as
% variable names) and take the cell barcodes from its first column.
meta_data=readtable('metadata3_neuron.txt','ReadVariableNames',0);
barcodes=meta_data(:,1);
barcodes_cell=table2cell(barcodes);
% Delete the last 2 characters ('-1') of every barcode. Working per
% barcode avoids assuming that every barcode is exactly 18 characters
% long; for 18-character barcodes the result is the same 16-character text.
barcodes_cell=cellfun(@(bc) bc(1:end-2),barcodes_cell,'UniformOutput',false);
% Kept so the workspace still offers the character-matrix view and its size.
barcodes_mat=char(barcodes_cell);
size_barcodes=size(barcodes_mat);

% Cluster information: the last metadata column, converted from its text
% form to a numeric array.
% NOTE(review): str2num evaluates the text as MATLAB code; since the text
% comes from a file, consider str2double once the column format is confirmed.
cluster_1=str2num(char(table2array(meta_data(:,end))));
% Untouched copy of the cluster labels.
cluster_copy=cluster_1;

% Load the cellranger barcode list and align it with the metadata barcodes.

cellranger_barcodes=readtable('barcodes_clearend.xlsx','ReadVariableNames',false);
cellranger_barcodes_cell=table2cell(cellranger_barcodes);
%cellranger_barcodes_mat=cell2mat(cellranger_barcodes_cell);

% For every barcode present in both lists, intersect reports its position
% in the metadata list (meta_idx) and in the cellranger list (cr_idx).
[~,meta_idx,cr_idx]=intersect(barcodes_cell,cellranger_barcodes_cell);
shared_cluster=cluster_1(meta_idx);
% Columns: metadata position, cellranger position, cluster.
barcode_shift=[meta_idx,cr_idx,shared_cluster];
% Keep only the shared cells, and store each cell's cluster in row 23.
pcdhg=pcdhg(:,cr_idx);
pcdhg(23,:)=shared_cluster;
clear meta_idx cr_idx shared_cluster

%
% Cluster statistics over all metadata rows.
source_cluster_num=max(cluster_1);
t_cluster=tabulate(cluster_1);%calculate the cluster distribution


%%
% Apply the UMI cutoff, drop low-count cells and compute, for each cell-level
% cutoff, the fraction of remaining cells that express each of the 22 genes.
% After the loop, pcdhg_cutoff, pcdhg_cutoff_bi and remove_cell_num hold the
% values for the LAST entry of cell_UMI_cutoff.

%set the cutoff
cutoff=10;
cell_UMI_cutoff=[1,5,10];%the number can be changed by the usage
each_gene_counts=zeros(numel(cell_UMI_cutoff),22);

% Loop-invariant part, computed once. Entries below cutoff+0.01 become 0;
% larger entries are rounded down to a multiple of (cutoff+0.01) and then up
% to the next integer.
pcdhg_quantized=ceil(floor(pcdhg(1:22,:)./(cutoff+0.01)).*(cutoff+0.01));
counts_in_cell=sum(pcdhg_quantized,1);

for i=1:numel(cell_UMI_cutoff)
    % Rows 1-22: thresholded counts; row 23: cluster of each cell.
    pcdhg_cutoff=[pcdhg_quantized;pcdhg(23,:)];
    % Cells (columns) whose summed thresholded counts do not exceed the cutoff.
    remove_cell_num=find(counts_in_cell<=cell_UMI_cutoff(i));
    pcdhg_cutoff(:,remove_cell_num)=[];
    pcdhg_cutoff_bi=pcdhg_cutoff;
    % Binarization: any positive count becomes 1. (Dividing by 10000 and
    % rounding up would yield values above 1 for counts larger than 10000.)
    pcdhg_cutoff_bi(1:22,:)=double(pcdhg_cutoff(1:22,:)>0);
    % Sum along the cell dimension explicitly so that a single remaining
    % cell still yields one value per gene.
    each_gene_counts(i,:)=sum(pcdhg_cutoff_bi(1:22,:),2)'./size(pcdhg_cutoff_bi,2);
end
%%
%update the metadata file, it can be further input to Seurat and get
%further analysis
% One flag per metadata row: 1 = cell removed by the cutoff, 0 = otherwise.
remove_cell_tag=zeros(size(meta_data,1),1);
% remove_cell_num indexes columns of the aligned pcdhg matrix, not metadata
% rows. Column 1 of barcode_shift holds the metadata row that corresponds to
% each aligned column, so translate through it before tagging.
remove_cell_tag(barcode_shift(remove_cell_num,1))=1;
remove_cell_tag=array2table(remove_cell_tag,'VariableNames',{'remove_cell_tag'});
% NOTE(review): confirm which variable name the appended column receives in
% the written file when it is assigned with parentheses.
meta_data(:,size(meta_data,2)+1)=remove_cell_tag;
writetable(meta_data,'metadata3_neuron_remove.txt');
%%
%cluster 12 is not typical neurons, we dropped it off
notneuron=12;
cluster_num=source_cluster_num-1;
for i=notneuron
    % Remove every cell (column) whose cluster id in row 23 equals i.
    pcdhg_cutoff_bi(:,pcdhg_cutoff_bi(23,:)==i)=[];
    % Lower the ids of all clusters above i by one.
    pcdhg_cutoff_bi(23,:)=pcdhg_cutoff_bi(23,:)-(pcdhg_cutoff_bi(23,:)>i);
end

%%
%needs the function fpcdh_distribution2 (not defined in this file)
%here contains the code of binaryzation, in fact, it has been binaryzed before

Labels=[0,1,2,3,4,5,6,7,8,9,10,11,13,14];%cluster 12 was dropped off

% Rows 1-19 of the binarized matrix plus the cluster row (row 23).
pcdhg19_bi=[pcdhg_cutoff_bi(1:19,:);pcdhg_cutoff_bi(23,:)];

% Make sure the output folder exists before any result is written, and
% build the paths with fullfile so the separator matches the platform.
out_dir='output';
if ~exist(out_dir,'dir')
    mkdir(out_dir);
end

% NOTE(review): the meaning of the arguments 3, 5 and 100 is defined by
% fpcdh_distribution2; 100 is presumably the number of shuffles - confirm.
[final_result,expected_mat,expected_mat_split,image]=fpcdh_distribution2(pcdhg19_bi,barcode_shift,3,5,100,Labels);
writetable(final_result,fullfile(out_dir,'414_neuron_UMI10.csv'));
saveas(image,fullfile(out_dir,'414_neuron_distribution'),'fig');
saveas(image,fullfile(out_dir,'414_neuron_distribution'),'emf');
close all
% Concatenate the matrices stored in expected_mat side by side, writing a
% 0-based index of the source cell into row 23 of each piece. The last
% entry of expected_mat is not included.
expected_mat2=[];
%100 times shuffle
n_expected=size(expected_mat,2)-1;
for i=1:n_expected
    add=expected_mat{i};
    add(23,:)=repmat(i-1,size(add,2),1);
    expected_mat2=horzcat(expected_mat2,add);
end

% Same assembly for every column of expected_mat_split; the last row of
% expected_mat_split is not included.
[n_split_rows,n_split_cols]=size(expected_mat_split);
for j=1:n_split_cols
    expected_mat3=[];
    for i=1:n_split_rows-1
        add=expected_mat_split{i,j};
        add(23,:)=repmat(i-1,size(add,2),1);
        expected_mat3=horzcat(expected_mat3,add);
    end
    expected_mat_split2{j}=expected_mat3;
end


